{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 8)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the first 10 reviews\n",
    "f = open('data/yelp/v6/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')\n",
    "js = []\n",
    "for i in range(10):\n",
    "    js.append(json.loads(f.readline()))\n",
    "f.close()\n",
    "review_df = pd.DataFrame(js)\n",
    "review_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using spacy: [Installation instructions for spacy](https://spacy.io/docs/usage/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "    \u001b[93mInfo about model en\u001b[0m\n",
      "\n",
      "    link               //anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/spacy/data/en\n",
      "    lang               en             \n",
      "    name               core_web_sm    \n",
      "    author             Explosion AI   \n",
      "    description        Small English model for spaCy. Includes vocabulary, syntax, entities and word vectors (GloVe).\n",
      "    source             /anaconda/envs/pharoah-eagle-owl/lib/python3.5/site-packages/en_core_web_sm/en_core_web_sm-1.2.0\n",
      "    email              contact@explosion.ai\n",
      "    license            CC BY-SA 3.0   \n",
      "    spacy_version      >=1.7.0,<2.0.0 \n",
      "    version            1.2.0          \n",
      "    url                https://explosion.ai\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# model meta data\n",
    "spacy.info('en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# preload the language model\n",
    "nlp = spacy.load('en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keeping it in a pandas dataframe\n",
    "doc_df = review_df['text'].apply(nlp)\n",
    "\n",
    "type(doc_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "spacy.tokens.doc.Doc"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(doc_df[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Got a letter in the mail last week that said Dr. Goldberg is moving to Arizona to take a new position there in June.  He will be missed very much.  \n",
       "\n",
       "I think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_df[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Got VERB VBP\n",
      "a DET DT\n",
      "letter NOUN NN\n",
      "in ADP IN\n",
      "the DET DT\n",
      "mail NOUN NN\n",
      "last ADJ JJ\n",
      "week NOUN NN\n",
      "that ADJ WDT\n",
      "said VERB VBD\n",
      "Dr. PROPN NNP\n",
      "Goldberg PROPN NNP\n",
      "is VERB VBZ\n",
      "moving VERB VBG\n",
      "to ADP IN\n",
      "Arizona PROPN NNP\n",
      "to PART TO\n",
      "take VERB VB\n",
      "a DET DT\n",
      "new ADJ JJ\n",
      "position NOUN NN\n",
      "there ADV RB\n",
      "in ADP IN\n",
      "June PROPN NNP\n",
      ". PUNCT .\n",
      "  SPACE SP\n",
      "He PRON PRP\n",
      "will VERB MD\n",
      "be VERB VB\n",
      "missed VERB VBN\n",
      "very ADV RB\n",
      "much ADV RB\n",
      ". PUNCT .\n",
      " \n",
      "\n",
      " SPACE SP\n",
      "I PRON PRP\n",
      "think VERB VBP\n",
      "finding VERB VBG\n",
      "a DET DT\n",
      "new ADJ JJ\n",
      "doctor NOUN NN\n",
      "in ADP IN\n",
      "NYC PROPN NNP\n",
      "that ADP IN\n",
      "you PRON PRP\n",
      "actually ADV RB\n",
      "like INTJ UH\n",
      "might VERB MD\n",
      "almost ADV RB\n",
      "be VERB VB\n",
      "as ADV RB\n",
      "awful ADJ JJ\n",
      "as ADP IN\n",
      "trying VERB VBG\n",
      "to PART TO\n",
      "find VERB VB\n",
      "a DET DT\n",
      "date NOUN NN\n",
      "! PUNCT .\n"
     ]
    }
   ],
   "source": [
    "# spacy gives you both fine grained (.pos_) + coarse grained (.tag_) parts of speech    \n",
    "for doc in doc_df[4]:\n",
    "    print(doc.text, doc.pos_, doc.tag_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[a letter, the mail, Dr. Goldberg, Arizona, a new position, June, He, I, a new doctor, NYC, you, a date]\n"
     ]
    }
   ],
   "source": [
    "# spaCy also does noun chunking for us\n",
    "\n",
    "print([chunk for chunk in doc_df[4].noun_chunks])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using [Textblob](https://textblob.readthedocs.io/en/dev/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from textblob import TextBlob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The default tagger in TextBlob uses the PatternTagger, the same as [pattern](https://www.clips.uantwerpen.be/pattern), which is fine for our example. To use the NLTK tagger, we can specify the pos_tagger when we call TextBlob. More [here](http://textblob.readthedocs.io/en/dev/advanced_usage.html#advanced)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "blob_df = review_df['text'].apply(TextBlob)\n",
    "\n",
    "type(blob_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "textblob.blob.TextBlob"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(blob_df[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Got', 'NNP'),\n",
       " ('a', 'DT'),\n",
       " ('letter', 'NN'),\n",
       " ('in', 'IN'),\n",
       " ('the', 'DT'),\n",
       " ('mail', 'NN'),\n",
       " ('last', 'JJ'),\n",
       " ('week', 'NN'),\n",
       " ('that', 'WDT'),\n",
       " ('said', 'VBD'),\n",
       " ('Dr.', 'NNP'),\n",
       " ('Goldberg', 'NNP'),\n",
       " ('is', 'VBZ'),\n",
       " ('moving', 'VBG'),\n",
       " ('to', 'TO'),\n",
       " ('Arizona', 'NNP'),\n",
       " ('to', 'TO'),\n",
       " ('take', 'VB'),\n",
       " ('a', 'DT'),\n",
       " ('new', 'JJ'),\n",
       " ('position', 'NN'),\n",
       " ('there', 'RB'),\n",
       " ('in', 'IN'),\n",
       " ('June', 'NNP'),\n",
       " ('He', 'PRP'),\n",
       " ('will', 'MD'),\n",
       " ('be', 'VB'),\n",
       " ('missed', 'VBN'),\n",
       " ('very', 'RB'),\n",
       " ('much', 'JJ'),\n",
       " ('I', 'PRP'),\n",
       " ('think', 'VBP'),\n",
       " ('finding', 'VBG'),\n",
       " ('a', 'DT'),\n",
       " ('new', 'JJ'),\n",
       " ('doctor', 'NN'),\n",
       " ('in', 'IN'),\n",
       " ('NYC', 'NNP'),\n",
       " ('that', 'IN'),\n",
       " ('you', 'PRP'),\n",
       " ('actually', 'RB'),\n",
       " ('like', 'IN'),\n",
       " ('might', 'MD'),\n",
       " ('almost', 'RB'),\n",
       " ('be', 'VB'),\n",
       " ('as', 'RB'),\n",
       " ('awful', 'JJ'),\n",
       " ('as', 'IN'),\n",
       " ('trying', 'VBG'),\n",
       " ('to', 'TO'),\n",
       " ('find', 'VB'),\n",
       " ('a', 'DT'),\n",
       " ('date', 'NN')]"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "blob_df[4].tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['got', 'goldberg', 'arizona', 'new position', 'june', 'new doctor', 'nyc']\n"
     ]
    }
   ],
   "source": [
    "# blobs in TextBlob also have noun phrase extraction\n",
    "\n",
    "print([np for np in blob_df[4].noun_phrases])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
